# Notebook setup cell: install the plotting / dashboard packages used below.
# `!` runs a shell command from the notebook; `{sys.executable} -m pip` invokes pip
# through the interpreter that is running this kernel.
import sys
!{sys.executable} -m pip install plotly
!{sys.executable} -m pip install Dash
!{sys.executable} -m pip install jupyter-dash
Requirements already satisfied
# Notebook setup cell: install the `hide_code` package, then install and enable
# its notebook extension and enable its server extension (all with --user).
import sys
!{sys.executable} -m pip install hide_code
!{sys.executable} -m jupyter nbextension install --py --user hide_code
!{sys.executable} -m jupyter nbextension enable --py --user hide_code
!{sys.executable} -m jupyter serverextension enable --py --user hide_code
Requirements already satisfied
Installing C:\Users\tomas\anaconda3\lib\site-packages\hide_code\ done
import pandas as pd
import numpy as np
import jupyter_dash as JupyterDash
import plotly.express as px
import os
# Load every CSV in ./Data into a single DataFrame `df`.
# File names are expected to look like "<city>_<type>.csv" (e.g. "vienna_weekends.csv");
# the two parts become the `city` and `type` columns.
current_path = os.getcwd()
# os.path.join keeps the path portable instead of hard-coding Windows separators.
data_dir = os.path.join(current_path, 'Data')
# Only pick up CSV files; sort so the load order does not depend on the file system.
files = sorted(name for name in os.listdir(data_dir) if name.lower().endswith('.csv'))
if not files:
    raise FileNotFoundError(f'No CSV files found in {data_dir}')

frames = []
for file in files:
    temp = pd.read_csv(os.path.join(data_dir, file))
    # Split on the last underscore so the trailing token is always the type.
    city_name, day_type = os.path.splitext(file)[0].rsplit('_', 1)
    temp['city'] = city_name
    temp['type'] = day_type
    frames.append(temp)

# Concatenate once (appending inside the loop re-copies the growing frame every
# iteration) and rebuild the index so row labels are unique across files.
df = pd.concat(frames, ignore_index=True)

print(df.columns)
print(len(df))
Index(['Unnamed: 0', 'realSum', 'room_type', 'room_shared', 'room_private',
'person_capacity', 'host_is_superhost', 'multi', 'biz',
'cleanliness_rating', 'guest_satisfaction_overall', 'bedrooms', 'dist',
'metro_dist', 'attr_index', 'attr_index_norm', 'rest_index',
'rest_index_norm', 'lng', 'lat', 'city', 'type'],
dtype='object')
51707
import plotly.graph_objects as go
import plotly.io as pio

pio.renderers.default = "notebook_connected"

cities = df["city"].drop_duplicates()

# Split violin per city: weekdays on the left half, weekends on the right half.
fig1 = go.Figure()
for day_type, half, colour in (
    ('weekdays', 'negative', 'blue'),
    ('weekends', 'positive', 'orange'),
):
    subset = df[df['type'] == day_type]
    fig1.add_trace(
        go.Violin(
            x=subset['city'],
            y=subset['realSum'],
            legendgroup=day_type,
            name=day_type,
            side=half,
            line_color=colour,
            box_visible=True,
            meanline_visible=True,
        )
    )

# spanmode='hard' clips the density to the observed data range; without it the
# kernel density estimate extended into negative prices.
fig1.update_traces(meanline_visible=True, spanmode='hard')
fig1.update_layout(
    violingap=0,
    violingroupgap=0,
    violinmode='overlay',
    title='AirBnB price distribution per city and weekday/weekend',
)
fig1.show()
We can see that a few very expensive AirBnB listings (outliers) make the graph less readable.
However, because the graph is interactive, we can zoom in on the part of the graph that doesn't include the outliers by selecting a specific area on the graph.
Alternatively, let's plot the same graph after dropping the outliers, i.e. the listings whose price is in the top 0.5% of prices for their city.
# Flag price outliers per city: a listing is an outlier when its price (`realSum`)
# is above the 99.5th percentile of the prices in the same city.
price_cutoff = df.groupby('city')['realSum'].transform(lambda prices: prices.quantile(0.995))
df['outliers'] = df['realSum'] > price_cutoff

dflen1 = len(df)
# Filter with a boolean mask rather than df.drop(<index labels>): when index labels
# repeat (e.g. after concatenating several files), dropping by label also removes
# every non-outlier row that shares a label with an outlier.
df1 = df[~df['outliers']].copy()
dflen2 = len(df1)
print(f'Dropped ' + str(dflen1 - dflen2) + ' outliers')
Dropped 3636 outliers
import plotly.graph_objects as go
import plotly.io as pio

pio.renderers.default = "notebook_connected"
fig1 = go.Figure()
cities = df1["city"].drop_duplicates()

# Same split-violin chart as before, drawn from the outlier-free frame df1.
weekday_rows = df1['type'] == 'weekdays'
weekend_rows = df1['type'] == 'weekends'
shared_style = dict(box_visible=True, meanline_visible=True)

weekday_violin = go.Violin(
    x=df1.loc[weekday_rows, 'city'],
    y=df1.loc[weekday_rows, 'realSum'],
    legendgroup='weekdays',
    name='weekdays',
    side='negative',
    line_color='blue',
    **shared_style,
)
weekend_violin = go.Violin(
    x=df1.loc[weekend_rows, 'city'],
    y=df1.loc[weekend_rows, 'realSum'],
    legendgroup='weekends',
    name='weekends',
    side='positive',
    line_color='orange',
    **shared_style,
)
fig1.add_trace(weekday_violin)
fig1.add_trace(weekend_violin)

# spanmode='hard' stops the kernel density estimate from extending past the data
# (it previously showed negative values).
fig1.update_traces(meanline_visible=True, spanmode='hard')
fig1.update_layout(
    violingap=0,
    violingroupgap=0,
    violinmode='overlay',
    title='AirBnB price distribution per city and weekday/weekend',
)
fig1.show()
Let's create an animated graph, packaged as a Dash app and deployed on Google Cloud Platform (GCP).
# Mean price (`realSum`) and mean guest satisfaction per (person_capacity, city);
# this aggregated table feeds the animated charts.
df4 = pd.pivot_table(
    df,
    values=['realSum', 'guest_satisfaction_overall'],
    index=['person_capacity', 'city'],
    aggfunc='mean',  # string alias instead of np.mean, which pandas deprecates here
)
df5 = df4.reset_index()

# Write and read through one explicit path instead of relying on a loop variable
# left over from another cell. The file name matches what this cell already read.
# NOTE(review): the name suggests raw Vienna data although the file holds the
# aggregated table; presumably the deployed app reads it - confirm before renaming.
summary_csv = 'vienna_weekends.csv'
df5.to_csv(summary_csv, index=False)
df6 = pd.read_csv(summary_csv)
df6.head()
| | person_capacity | city | guest_satisfaction_overall | realSum |
|---|---|---|---|---|
| 0 | 2.0 | amsterdam | 94.380914 | 417.738213 |
| 1 | 2.0 | athens | 94.278374 | 131.160254 |
| 2 | 2.0 | barcelona | 91.584144 | 226.304795 |
| 3 | 2.0 | berlin | 95.088330 | 202.941984 |
| 4 | 2.0 | budapest | 94.459916 | 139.075117 |
# Setup that only works locally (inline in the notebook); the app had to be
# deployed online separately.
# Imported here so the cell is self-contained: JupyterDash must be the class
# (not the jupyter_dash module), and the layout/callback helpers come from dash.
from dash import Input, Output, dcc, html
from jupyter_dash import JupyterDash

app3 = JupyterDash(__name__)
app3.layout = html.Div([
    html.H4('Animated AirBnB Prices'),
    html.P("Select an animation:"),
    dcc.RadioItems(
        id='selection',
        options=["Bar", "Scatter"],
        value='Bar',
    ),
    dcc.Loading(dcc.Graph(id="graph"), type="cube")
])


@app3.callback(
    Output("graph", "figure"),
    Input("selection", "value"))
def display_animated_graph(selection):
    """Return the animated figure matching the selected radio item.

    Args:
        selection: value of the 'selection' radio items, "Bar" or "Scatter".

    Returns:
        A plotly express figure built from `df6`, animated over `person_capacity`.

    Raises:
        KeyError: if `selection` is not one of the known animation names.
    """
    # Map each option to a builder so only the requested figure is constructed
    # (building both on every callback doubles the work for no benefit).
    builders = {
        'Scatter': lambda: px.scatter(
            df6, x="guest_satisfaction_overall", y="realSum", animation_frame="person_capacity",
            animation_group="city", size='realSum', color="city",
            hover_name="realSum", size_max=50,
            range_x=[80, 110], range_y=[0, 2000]),
        'Bar': lambda: px.bar(
            df6, x="city", y="realSum",
            animation_frame="person_capacity", animation_group="city",
            range_y=[0, 2000]),
    }
    return builders[selection]()


app3.run_server(mode='inline', debug=True, port=1235)
The app has been deployed on Google Cloud Platform via Docker.
Let's create a map of AirBnB locations in Rome and visually show how far the locations are from the closest metro station.
# Mapbox token for the map below: taken from the MAPBOX_ACCESS_TOKEN environment
# variable when it is set, otherwise the placeholder value is used.
px.set_mapbox_access_token(os.environ.get('MAPBOX_ACCESS_TOKEN', 'myaccesstoken'))

# Weekday listings in Rome. Take an explicit copy so that adding columns to `rome`
# later neither triggers SettingWithCopyWarning nor depends on `df`.
rome = df[(df["city"] == 'rome') & (df["type"] == 'weekdays')].copy()
First, let's understand the distribution of distances in order to define our buckets. (adding a breakdown regarding superhost status just for fun.)
import plotly.express as px

# Histogram of the metro distance for the Rome listings, coloured by superhost
# status, with a rug plot in the margin (`marginal` can also be `box` or `violin`).
histogram_options = dict(
    x="metro_dist",
    color='host_is_superhost',
    marginal="rug",
    hover_data=rome.columns,
)
fig = px.histogram(rome, **histogram_options)
fig.show()
import warnings

# Add a `Distance` column that buckets each listing by its distance (km, per the
# labels) from the nearest metro station.
cut_labels_5 = ['Very Close (<0.25km)', 'Close (0.25-0.5km)', 'Walkable (0.5km-1.25km)', 'Far (1.25km-2.5km)', 'Very Far (>2.5km)']
# The last bin is open-ended so that listings further than 5 km still fall into
# 'Very Far (>2.5km)' instead of becoming NaN.
cut_bins = [0, 0.25, 0.5, 1.25, 2.5, float('inf')]
# include_lowest=True keeps a distance of exactly 0 in the first bucket.
# assign() returns a new frame, so no SettingWithCopyWarning is raised and there
# is no need to silence every warning globally.
rome = rome.assign(
    Distance=pd.cut(rome['metro_dist'], bins=cut_bins, labels=cut_labels_5, include_lowest=True)
)
The map shows all weekday AirBnB locations in Rome, coloured by their distance from the closest metro station. The size of each bubble corresponds to the price of the AirBnB. You can hover over each point for more information.
import plotly.express as px

# Map of the Rome weekday listings: colour = distance bucket, bubble size = price.
distance_order = ['Very Close (<0.25km)', 'Close (0.25-0.5km)', 'Walkable (0.5km-1.25km)', 'Far (1.25km-2.5km)', 'Very Far (>2.5km)']
fig = px.scatter_mapbox(
    rome,
    lat='lat',
    lon='lng',
    color="Distance",
    size="realSum",
    color_continuous_scale=px.colors.cyclical.IceFire,
    size_max=15,
    zoom=10,
    # Keep the legend in near-to-far order.
    category_orders={"Distance": distance_order},
)
fig.show()